1 Effect of UPSTM-Based Decorrelation on Feature Discovery

1.0.1 Loading the libraries

# Load the analysis libraries.
# FRESA.CAD supplies IDeA/ILAA decorrelation, FRESAScale, heatMaps,
# uniRankVar and the other modeling helpers used throughout the report.
library("FRESA.CAD")
library(readxl)
library(igraph)     # feature-association network plots
library(umap)
library(tsne)
library(entropy)    # entropy()/discretize() summaries
library(TH.data)    # the GlaucomaM data set
library(stringr)    # str_remove_all/str_detect/str_replace_all used below
                    # (was an implicit dependency; made explicit here)
op <- par(no.readonly = TRUE)   # saved graphics state, restored via par(op)
pander::panderOptions('digits', 3)
pander::panderOptions('table.split.table', 400)
pander::panderOptions('keep.trailing.zeros',TRUE)

1.1 Material and Methods

All variables are derived from a laser scanning image of the eye background taken by the Heidelberg Retina Tomograph. Most of the variables describe either the area or volume in certain parts of the papilla and are measured in four sectors (temporal, superior, nasal and inferior) as well as for the whole papilla (global). The global measurement is, roughly, the sum of the measurements taken in the four sectors.

The observations in both groups are matched by age and sex to prevent any bias.

Source Torsten Hothorn and Berthold Lausen (2003), Double-Bagging: Combining classifiers by bootstrap aggregation. Pattern Recognition, 36(6), 1303–1309.

1.2 The Data

GlaucomaM {TH.data}


# Load the GlaucomaM data set from TH.data: 196 matched eyes with
# HRT-derived morphometric features plus the Class factor.
data("GlaucomaM")

# Class balance: the groups are matched, 98 subjects each.
pander::pander(table(GlaucomaM$Class))
glaucoma normal
98 98

# Recode the factor outcome as numeric: 1 = glaucoma, 0 = normal.
GlaucomaM$Class <- 1*(GlaucomaM$Class=="glaucoma")

1.2.0.1 Standardize the names for the reporting

# Report configuration: working data, outcome column, and tuning constants.
studyName <- "GlaucomaM"   # study label used for reporting
dataframe <- GlaucomaM     # working copy of the data
outcome <- "Class"         # name of the binary outcome column
thro <- 0.80               # correlation threshold passed to IDeA()
TopVariables <- 10         # number of top-ranked features to report
cexheat <- 0.25            # heatmap label size (was assigned with `=`; use `<-`)

1.3 Generating the report

1.3.1 Libraries

Some libraries

# Additional modeling libraries: exploratory factor analysis (psych),
# whitening transforms, violin plots, and CART trees (rpart).
library(psych)
library(whitening)
library(vioplot)
library(rpart)

1.3.2 Data specs

# Data dimensions: rows = subjects, col = number of predictor columns.
pander::pander(c(rows=nrow(dataframe),col=ncol(dataframe)-1))
rows col
196 62
# Outcome distribution after recoding (0 = normal, 1 = glaucoma).
pander::pander(table(dataframe[,outcome]))
0 1
98 98

# Predictor names: every column except the outcome.
varlist <- colnames(dataframe)
varlist <- varlist[varlist != outcome]

# Flag very wide data sets; the heavy visualizations below are skipped when TRUE.
largeSet <- length(varlist) > 1500 

1.3.3 Scaling the data

Scaling and removing near-zero-variance columns and highly collinear (r > 0.99999) columns


  ### Some global cleaning
  # Keep only columns whose standard deviation is effectively non-zero.
  # NOTE(review): the name `sdiszero` is misleading -- TRUE marks columns
  # whose sd is NOT zero, i.e. the columns that are retained.
  sdiszero <- apply(dataframe,2,sd) > 1.0e-16
  dataframe <- dataframe[,sdiszero]

  # Drop near-duplicate predictors (|r| > 0.99999) via FRESA.CAD, then
  # re-append the outcome column.
  varlist <- colnames(dataframe)[colnames(dataframe) != outcome]
  tokeep <- c(as.character(correlated_Remove(dataframe,varlist,thr=0.99999)),outcome)
  dataframe <- dataframe[,tokeep]

  # Refresh the predictor list after cleaning.
  varlist <- colnames(dataframe)
  varlist <- varlist[varlist != outcome]
  
  # Columns with at least 5 distinct values are treated as continuous
  # (this logical vector spans ALL columns, including the outcome).
  iscontinous <- sapply(apply(dataframe,2,unique),length) >= 5 ## Only variables with enough samples



# Rank-based ("OrderLogit") scaling of every column via FRESA.CAD.
dataframeScaled <- FRESAScale(dataframe,method="OrderLogit")$scaledData

1.4 The heatmap of the data

# Cap the number of samples drawn in the heat maps at 1000.
numsub <- min(nrow(dataframe), 1000)


# Heat map of the scaled raw data (skipped for very wide data sets).
if (!largeSet)
{

  hm <- heatMaps(data=dataframeScaled[1:numsub,],
                 Outcome=outcome,
                 Scale=TRUE,
                 hCluster = "row",   # cluster samples only
                 xlab="Feature",
                 ylab="Sample",
                 srtCol=45,
                 srtRow=45,
                 cexCol=cexheat,
                 cexRow=cexheat
                 )
  par(op)   # restore graphics defaults
}

1.4.0.1 Correlation Matrix of the Data

The heat map of the data


# Pairwise Pearson correlation of the raw predictors, shown as |r|.
if (!largeSet)
{

  par(cex=0.6,cex.main=0.85,cex.axis=0.7)
  #cormat <- Rfast::cora(as.matrix(dataframe[,varlist]),large=TRUE)
  # NAs (constant columns, if any slipped through) are set to 0 so the
  # heat map and the max-|r| summary below are well defined.
  cormat <- cor(dataframe[,varlist],method="pearson")
  cormat[is.na(cormat)] <- 0
  gplots::heatmap.2(abs(cormat),
                    trace = "none",
  #                  scale = "row",
                    mar = c(5,5),
                    col=rev(heat.colors(5)),
                    main = "Original Correlation",
                    cexRow = cexheat,
                    cexCol = cexheat,
                     srtCol=45,
                     srtRow=45,
                    key.title=NA,
                    key.xlab="|Pearson Correlation|",
                    xlab="Feature", ylab="Feature")
  # Largest off-diagonal |correlation| before decorrelation.
  diag(cormat) <- 0
  print(max(abs(cormat)))
}

[1] 0.9961105

1.5 The decorrelation


# IDeA/ILAA decorrelation: iteratively removes correlations above `thro`,
# creating latent "La_" features; verbose=TRUE prints per-iteration progress.
DEdataframe <- IDeA(dataframe,verbose=TRUE,thr=thro)
#> 
#>  mr mv 
#>        ag        at        as        an        ai       eag 
#> 0.9838710 0.9516129 0.8870968 0.9677419 0.9193548 0.7096774 
#> 
#>  Included: 62 , Uni p: 0.002419355 , Base Size: 2 , Rcrit: 0.2004905 
#> 
#> 
 1 <R=0.996,thr=0.950>, Top: 10< 5 >[Fa= 10 ]( 10 , 16 , 0 ),<|><>Tot Used: 26 , Added: 16 , Zero Std: 0 , Max Cor: 0.946
#> 
 2 <R=0.946,thr=0.900>, Top: 7< 5 >[Fa= 10 ]( 7 , 14 , 10 ),<|><>Tot Used: 37 , Added: 14 , Zero Std: 0 , Max Cor: 0.892
#> 
 3 <R=0.892,thr=0.800>, Top: 11< 2 >[Fa= 17 ]( 11 , 14 , 10 ),<|><>Tot Used: 52 , Added: 14 , Zero Std: 0 , Max Cor: 0.799
#> 
 4 <R=0.799,thr=0.800>
#> 
 [ 4 ], 0.7689698 Decor Dimension: 52 Nused: 52 . Cor to Base: 29 , ABase: 62 , Outcome Base: 0 
#> 
# Decorrelated predictor names (outcome excluded).
varlistc <- colnames(DEdataframe)[colnames(DEdataframe) != outcome]

# Total variance before decorrelation...
pander::pander(sum(apply(dataframe[,varlist],2,var)))

3.03

# ...and after: the latent features carry much less redundant variance.
pander::pander(sum(apply(DEdataframe[,varlistc],2,var)))

0.678

# Entropy of the pooled (discretized) values, before and after.
pander::pander(entropy(discretize(unlist(dataframe[,varlist]), 256)))

3.94

pander::pander(entropy(discretize(unlist(DEdataframe[,varlistc]), 256)))

3.74


# Residual-variance ratio of each latent feature relative to its source.
varratio <- attr(DEdataframe,"VarRatio")

pander::pander(tail(varratio))
La_as La_ai La_vbss La_an La_at La_ag
0.0366 0.0366 0.0219 0.0212 0.0108 0.00776

1.5.1 The decorrelation matrix


# Sparsity pattern of the UPLTM transformation matrix returned by IDeA:
# a cell is 1 wherever an input feature contributes to an output feature.
if (!largeSet)
{

  par(cex=0.6,cex.main=0.85,cex.axis=0.7)
  
  UPLTM <- attr(DEdataframe,"UPLTM")
  
  gplots::heatmap.2(1.0*(abs(UPLTM)>0),
                    trace = "none",
                    mar = c(5,5),
                    col=rev(heat.colors(5)),
                    main = "Decorrelation matrix",
                    cexRow = cexheat,
                    cexCol = cexheat,
                   srtCol=45,
                   srtRow=45,
                    key.title=NA,
                    key.xlab="|Beta|>0",
                    xlab="Output Feature", ylab="Input Feature")
  
  par(op)
  
  
  
}

1.5.2 Formulas Network

Displaying the features associations

# Feature-association network: turn the UPLTM sparsity pattern into a
# weighted directed graph, keep only the most connected features, and
# plot their community structure plus a table of latent formulas.
par(op)
clustable <- c("To many variables")


  # Weighted adjacency: nonzero UPLTM entries, weighted by the observed
  # correlation between the corresponding raw features.
  transform <- attr(DEdataframe,"UPLTM") != 0
  tnames <- colnames(transform)
  colnames(transform) <- str_remove_all(colnames(transform),"La_")
  transform <- abs(transform*cor(dataframe[,rownames(transform)])) # The weights are proportional to the observed correlation
  
  
  # Vertex size encodes the independence-relevance score (fscore),
  # min-max normalized to [0, 10].
  fscore <- attr(DEdataframe,"fscore")
  VertexSize <- fscore # The size depends on the variable independence relevance (fscore)
  names(VertexSize) <- str_remove_all(names(VertexSize),"La_")
  VertexSize <- 10*(VertexSize-min(VertexSize))/(max(VertexSize)-min(VertexSize)) # Normalization

  # Row/column connectivity scores; the small VertexSize and varratio
  # terms only break ties. NOTE(review): assumes VertexSize and
  # varratio[tnames] align with the rows/columns of `transform` -- verify.
  VertexSize <- VertexSize[rownames(transform)]
  rsum <- apply(1*(transform !=0),1,sum) + 0.01*VertexSize + 0.001*varratio[tnames]
  csum <- apply(1*(transform !=0),2,sum) + 0.01*VertexSize + 0.001*varratio[tnames]
  
  ntop <- min(10,length(rsum))


  # Seed the subgraph with the ntop best-connected rows and columns, then
  # add any column linked to more than one of them.
  topfeatures <- unique(c(names(rsum[order(-rsum)])[1:ntop],names(csum[order(-csum)])[1:ntop]))
  rtrans <- transform[topfeatures,]
  csum <- (apply(1*(rtrans !=0),2,sum) > 1*(colnames(rtrans) %in% topfeatures))
  rtrans <- rtrans[,csum]
  topfeatures <- unique(c(topfeatures,colnames(rtrans)))
  print(ncol(transform))

[1] 52

  transform <- transform[topfeatures,topfeatures]
  print(ncol(transform))

[1] 48

  # For very large subgraphs, iteratively prune down to a plottable size.
  if (ncol(transform)>100)
  {
    csum <- apply(1*(transform !=0),1,sum) 
    csum <- csum[csum > 1]
    csum <- csum + 0.01*VertexSize[names(csum)]
    csum <- csum[order(-csum)]
    tpsum <- min(20,length(csum))
    trsum <- rownames(transform)[rownames(transform) %in% names(csum[1:tpsum])]
    rtrans <- transform[trsum,]
    topfeatures <- unique(c(rownames(rtrans),colnames(rtrans)))
    transform <- transform[topfeatures,topfeatures]
    if (nrow(transform) > 150)
    {
      csum <- apply(1*(rtrans != 0 ),2,sum)
      csum <- csum + 0.01*VertexSize[names(csum)]
      csum <- csum[order(-csum)]
      tpsum <- min(130,length(csum))
      csum <- rownames(transform)[rownames(transform) %in% names(csum[1:tpsum])]
      csum <- unique(c(trsum,csum))
      transform <- transform[csum,csum]
    }
    print(ncol(transform))
  }

    # Plot the reduced matrix and its community graph, and build the
    # formula table; skipped when the graph is still too large.
    if (ncol(transform) < 150)
    {

      gplots::heatmap.2(transform,
                        trace = "none",
                        mar = c(5,5),
                        col=rev(heat.colors(5)),
                        main = "Red Decorrelation matrix",
                        cexRow = cexheat,
                        cexCol = cexheat,
                       srtCol=45,
                       srtRow=45,
                        key.title=NA,
                        key.xlab="|Beta|>0",
                        xlab="Output Feature", ylab="Input Feature")
  
      par(op)
      VertexSize <- VertexSize[colnames(transform)]
      gr <- graph_from_adjacency_matrix(transform,mode = "directed",diag = FALSE,weighted=TRUE)
      gr$layout <- layout_with_fr

      # Optimal-modularity community detection on the association graph.
      fc <- cluster_optimal(gr)
      plot(fc, gr,
           edge.width = 2*E(gr)$weight,
           vertex.size=VertexSize,
           edge.arrow.size=0.5,
           edge.arrow.width=0.5,
           vertex.label.cex=(0.15+0.05*VertexSize),
           vertex.label.dist=0.5 + 0.05*VertexSize,
           main="Top Feature Association")
      
      varratios <- varratio
      fscores <- fscore
      names(varratios) <- str_remove_all(names(varratios),"La_")
      names(fscores) <- str_remove_all(names(fscores),"La_")

      # Human-readable latent-variable formulas from the decorrelation.
      dc <- getLatentCoefficients(DEdataframe)
      theCharformulas <- attr(dc,"LatentCharFormulas")

      # One row per plotted feature: formula, community, residual
      # variance and fscore; sorted by community then by fscore.
      clustable <- as.data.frame(cbind(Variable=fc$names,
                                       Formula=as.character(theCharformulas[paste("La_",fc$names,sep="")]),
                                       Class=fc$membership,
                                       ResidualVariance=round(varratios[fc$names],3),
                                       Fscore=round(fscores[fc$names],3)
                                       )
                                 )
      rownames(clustable) <- str_replace_all(rownames(clustable),"__","_")
      clustable$Variable <- NULL
      clustable$Class <- as.integer(clustable$Class)
      clustable$ResidualVariance <- as.numeric(clustable$ResidualVariance)
      clustable$Fscore <- as.numeric(clustable$Fscore)
      clustable <- clustable[order(-clustable$Fscore),]
      clustable <- clustable[order(clustable$Class),]
      clustable <- clustable[clustable$Fscore >= -1,]
      topv <- min(50,nrow(clustable))
      clustable <- clustable[1:topv,]
    }


# Render the cluster/formula table.
pander::pander(clustable)
  Formula Class ResidualVariance Fscore
mr NA 1 1.000 7
an + an - (1.956)mr 1 0.021 0
ag + ag - (5.849)mr 1 0.008 -1
as + as - (1.384)mr 1 0.037 -1
ai + ai - (1.407)mr 1 0.037 -1
eat + eat - (1.027)mr 1 0.161 -1
eas + eas - (1.374)mr 1 0.300 -1
vbsg + vbsg - (2.978)vbrs 2 0.204 5
mdic - (0.276)vbsg + mdic 2 0.170 0
vbst - (0.178)vbsg + vbst 2 0.167 0
vbsn - (0.292)vbsg + vbsn 2 0.122 -1
vbsi - (0.237)vbsg + vbsi 2 0.104 -1
vbrt - (0.947)vbst + vbrt 2 0.056 -1
vbri - (0.209)vbsg + vbri 2 0.176 -1
emd - (1.004)mdic + emd 2 0.060 -1
vbrs NA 3 1.000 4
vbrg + vbrg - (3.348)vbrs 3 0.037 0
vbrn - (1.106)vbrs + vbrn 3 0.174 -1
abrg + abrg - (5.498)vbrs 4 0.262 4
eag + eag - (0.845)abrg 4 0.156 0
ean - (0.378)eag + ean 4 0.093 -1
abrt - (0.165)abrg + abrt 4 0.177 -1
abrs - (0.246)abrg + abrs 4 0.082 -1
abrn - (0.346)abrg + abrn 4 0.094 -1
abri - (0.244)abrg + abri 4 0.086 -1
mdg NA 5 1.000 4
mdt - (0.865)mdg + mdt 5 0.155 -1
mds - (1.090)mdg + mds 5 0.063 -1
mdn - (1.066)mdg + mdn 5 0.259 -1
mdi - (0.924)mdg + mdi 5 0.126 -1
vasg NA 6 1.000 3
vass - (0.275)vasg + vass 6 0.154 -1
vasn - (0.531)vasg + vasn 6 0.061 -1
vasi - (0.177)vasg + vasi 6 0.323 -1
varg NA 7 1.000 3
vars - (0.264)varg + vars 7 0.133 -1
varn - (0.431)varg + varn 7 0.072 -1
vari - (0.270)varg + vari 7 0.139 -1
mhcs NA 8 1.000 2
mhcg + mhcg - (0.689)mhcs 8 0.330 -1
phcs - (0.877)mhcs + phcs 8 0.224 -1
tmg NA 9 1.000 2
tms - (1.217)tmg + tms 9 0.230 -1
tmi - (1.044)tmg + tmi 9 0.314 -1
mhct NA 10 1.000 1
phct - (0.845)mhct + phct 10 0.251 -1

# Restore the default graphical parameters saved at the top of the report.
par(op)

1.6 The heatmap of the decorrelated data

# Heat map of the decorrelated data (same settings as the raw-data map).
if (!largeSet)
{

  hm <- heatMaps(data=DEdataframe[1:numsub,],
                 Outcome=outcome,
                 Scale=TRUE,
                 hCluster = "row",
                 cexRow = cexheat,
                 cexCol = cexheat,
                 srtCol=45,
                 srtRow=45,
                 xlab="Feature",
                 ylab="Sample")
  par(op)
}

1.7 The correlation matrix after decorrelation

# Correlation structure after decorrelation; the printed max off-diagonal
# |r| should now be at or below the IDeA threshold `thro`.
if (!largeSet)
{

  cormat <- cor(DEdataframe[,varlistc],method="pearson")
  cormat[is.na(cormat)] <- 0
  
  gplots::heatmap.2(abs(cormat),
                    trace = "none",
                    mar = c(5,5),
                    col=rev(heat.colors(5)),
                    main = "Correlation after ILAA",
                    cexRow = cexheat,
                    cexCol = cexheat,
                     srtCol=45,
                     srtRow=45,
                    key.title=NA,
                    key.xlab="|Pearson Correlation|",
                    xlab="Feature", ylab="Feature")
  
  par(op)
  diag(cormat) <- 0
  print(max(abs(cormat)))
}

[1] 0.799364

1.8 U-MAP Visualization of features

1.8.1 The UMAP on Raw Data


  # One color per outcome class for the UMAP scatter labels.
  classes <- unique(dataframe[1:numsub,outcome])
  raincolors <- rainbow(length(classes))
  names(raincolors) <- classes
  # Feature selection: union of univariate ensemble ranking and LASSO.
  topvars <- univariate_BinEnsemble(dataframe,outcome)
  lso <- LASSO_MIN(formula(paste(outcome,"~.")),dataframe,family="binomial")
  topvars <- unique(c(names(topvars),lso$selectedfeatures))
  pander::pander(head(topvars))

hic, varg, vars, vari, tmi and phci

#  names(topvars)
#if (nrow(dataframe) < 1000)
#{
  # 2-D UMAP embedding of the selected (scaled) raw features.
  datasetframe.umap = umap(scale(dataframe[1:numsub,topvars]),n_components=2)
#  datasetframe.umap = umap(dataframe[1:numsub,varlist],n_components=2)
  plot(datasetframe.umap$layout,xlab="U1",ylab="U2",main="UMAP: Original",t='n')
  text(datasetframe.umap$layout,labels=dataframe[1:numsub,outcome],col=raincolors[dataframe[1:numsub,outcome]+1])

#}

1.8.2 The decorrelated UMAP

  # Keep only latent features with residual variance >= 1% for selection.
  # NOTE(review): the frame DEdataframe[,varlistcV] is passed with the
  # outcome name -- confirm `outcome` is among names(varratio); otherwise
  # the selection calls below would not see the outcome column.
  varlistcV <- names(varratio[varratio >= 0.01])
  topvars <- univariate_BinEnsemble(DEdataframe[,varlistcV],outcome)
  lso <- LASSO_MIN(formula(paste(outcome,"~.")),DEdataframe[,varlistcV],family="binomial")
  topvars <- unique(c(names(topvars),lso$selectedfeatures))
  pander::pander(head(topvars))

hic, varg, phci, rnf, phcg and vbrs


  # Drop the outcome before the later CART fit that reuses varlistcV.
  varlistcV <- varlistcV[varlistcV != outcome]
  
#  DEdataframe[,outcome] <- as.numeric(DEdataframe[,outcome])
#if (nrow(dataframe) < 1000)
#{
  # 2-D UMAP embedding of the selected decorrelated features.
  datasetframe.umap = umap(scale(DEdataframe[1:numsub,topvars]),n_components=2)
#  datasetframe.umap = umap(DEdataframe[1:numsub,varlistcV],n_components=2)
  plot(datasetframe.umap$layout,xlab="U1",ylab="U2",main="UMAP: After ILAA",t='n')
  text(datasetframe.umap$layout,labels=DEdataframe[1:numsub,outcome],col=raincolors[DEdataframe[1:numsub,outcome]+1])

#}

1.9 Univariate Analysis

1.9.1 Univariate



# Univariate AUC ranking of the raw predictors against the outcome.
univarRAW <- uniRankVar(varlist,
               paste(outcome,"~1"),
               outcome,
               dataframe,
               rankingTest="AUC")


# Univariate AUC ranking of the decorrelated predictors.
# (A trailing comma after rankingTest="AUC" was removed: in R it injects
# an empty argument into the call.)
univarDe <- uniRankVar(varlistc,
               paste(outcome,"~1"),
               outcome,
               DEdataframe,
               rankingTest="AUC")

1.9.2 Final Table


# Columns reported in the univariate summary tables.
univariate_columns <- c("caseMean","caseStd","controlMean","controlStd","controlKSP","ROCAUC")

##top variables
# Logical mask selecting the first TopVariables rows of the ranked table.
# seq_along() replaces the unsafe c(1:length(varlist)) pattern.
topvar <- seq_along(varlist) <= TopVariables
tableRaw <- univarRAW$orderframe[topvar,univariate_columns]
pander::pander(tableRaw)
  caseMean caseStd controlMean controlStd controlKSP ROCAUC
vari 0.04480 0.0368 0.1150 0.0543 0.1035 0.880
varg 0.17851 0.1170 0.4139 0.1967 0.0252 0.873
vars 0.04506 0.0314 0.1068 0.0596 0.0172 0.851
tmi 0.03664 0.1258 -0.1097 0.1038 0.6980 0.832
varn 0.08224 0.0595 0.1774 0.0894 0.2533 0.830
hic 0.39863 0.1407 0.2114 0.1574 0.7659 0.822
tmg -0.03356 0.0885 -0.1524 0.0927 0.5179 0.818
phcg -0.03546 0.0691 -0.1216 0.0661 0.9359 0.818
phci 0.00898 0.0882 -0.0937 0.0779 0.7638 0.814
tms 0.03959 0.1166 -0.1192 0.1376 0.9756 0.808


# Names of the latent ("La_") features in the decorrelated ranking.
topLAvar <- univarDe$orderframe$Name[str_detect(univarDe$orderframe$Name,"La_")]
# Top-ranked decorrelated features plus the best TopVariables/2 latent ones.
# NOTE(review): `topvar` was built with length(varlist) elements but here
# indexes the decorrelated ranking -- confirm both rankings share a length.
topLAvar <- unique(c(univarDe$orderframe$Name[topvar],topLAvar[1:as.integer(TopVariables/2)]))
finalTable <- univarDe$orderframe[topLAvar,univariate_columns]


pander::pander(finalTable)
  caseMean caseStd controlMean controlStd controlKSP ROCAUC
varg 0.17851 0.11700 0.4139 0.1967 2.52e-02 0.873
hic 0.39863 0.14073 0.2114 0.1574 7.66e-01 0.822
tmg -0.03356 0.08846 -0.1524 0.0927 5.18e-01 0.818
phcg -0.03546 0.06909 -0.1216 0.0661 9.36e-01 0.818
phci 0.00898 0.08818 -0.0937 0.0779 7.64e-01 0.814
rnf 0.13956 0.06764 0.2252 0.0975 2.25e-01 0.800
phcn 0.00692 0.07364 -0.0717 0.0881 3.03e-01 0.796
vart 0.00640 0.00533 0.0146 0.0117 7.54e-04 0.777
vbrs 0.16142 0.09970 0.0861 0.1319 4.75e-06 0.773
La_mdic 0.07786 0.05356 0.0351 0.0380 1.36e-01 0.766
La_vbrt -0.02477 0.01482 -0.0345 0.0204 2.83e-01 0.713
La_as -0.61158 0.02849 -0.5903 0.0285 8.42e-01 0.709
La_vbri -0.01438 0.03870 -0.0394 0.0364 8.08e-02 0.698
La_eas -0.72443 0.05906 -0.7891 0.1222 4.93e-03 0.697

# Latent-variable coefficients and relevance scores from the decorrelation.
dc <- getLatentCoefficients(DEdataframe)
fscores <- attr(DEdataframe,"fscore")


# Summary of latent variables: mean formula size, count, and the fraction
# of original predictors that were transformed.
pander::pander(c(mean=mean(sapply(dc,length)),total=length(dc),fraction=length(dc)/(ncol(dataframe)-1)))
mean total fraction
2.07 42 0.677

theCharformulas <- attr(dc,"LatentCharFormulas")

# Append the raw top variables that are not already in the latent table.
topvar <- rownames(tableRaw)
finalTable <- rbind(finalTable,tableRaw[topvar[!(topvar %in% topLAvar)],univariate_columns])


# Annotate each row with the raw-feature AUC, decorrelation formula,
# fscore and residual-variance ratio (NA where not applicable).
orgnamez <- rownames(finalTable)
orgnamez <- str_remove_all(orgnamez,"La_")
finalTable$RAWAUC <- univarRAW$orderframe[orgnamez,"ROCAUC"]
finalTable$DecorFormula <- theCharformulas[rownames(finalTable)]
finalTable$fscores <- fscores[rownames(finalTable)]
finalTable$varratio <- varratio[rownames(finalTable)]

Final_Columns <- c("DecorFormula","caseMean","caseStd","controlMean","controlStd","controlKSP","ROCAUC","RAWAUC","fscores","varratio")

# Final report table, sorted by decorrelated AUC.
finalTable <- finalTable[order(-finalTable$ROCAUC),]
pander::pander(finalTable[,Final_Columns])
  DecorFormula caseMean caseStd controlMean controlStd controlKSP ROCAUC RAWAUC fscores varratio
vari NA 0.04480 0.03683 0.1150 0.0543 1.04e-01 0.880 0.880 NA NA
varg NA 0.17851 0.11700 0.4139 0.1967 2.52e-02 0.873 0.873 3 1.0000
vars NA 0.04506 0.03143 0.1068 0.0596 1.72e-02 0.851 0.851 NA NA
tmi NA 0.03664 0.12578 -0.1097 0.1038 6.98e-01 0.832 0.832 NA NA
varn NA 0.08224 0.05953 0.1774 0.0894 2.53e-01 0.830 0.830 NA NA
hic NA 0.39863 0.14073 0.2114 0.1574 7.66e-01 0.822 0.822 0 1.0000
tmg NA -0.03356 0.08846 -0.1524 0.0927 5.18e-01 0.818 0.818 2 1.0000
phcg NA -0.03546 0.06909 -0.1216 0.0661 9.36e-01 0.818 0.818 0 1.0000
phci NA 0.00898 0.08818 -0.0937 0.0779 7.64e-01 0.814 0.814 1 1.0000
tms NA 0.03959 0.11659 -0.1192 0.1376 9.76e-01 0.808 0.808 NA NA
rnf NA 0.13956 0.06764 0.2252 0.0975 2.25e-01 0.800 0.800 0 1.0000
phcn NA 0.00692 0.07364 -0.0717 0.0881 3.03e-01 0.796 0.796 1 1.0000
vart NA 0.00640 0.00533 0.0146 0.0117 7.54e-04 0.777 0.777 0 1.0000
vbrs NA 0.16142 0.09970 0.0861 0.1319 4.75e-06 0.773 0.773 4 1.0000
La_mdic - (0.276)vbsg + mdic 0.07786 0.05356 0.0351 0.0380 1.36e-01 0.766 0.785 0 0.1696
La_vbrt - (0.947)vbst + vbrt -0.02477 0.01482 -0.0345 0.0204 2.83e-01 0.713 0.709 -1 0.0562
La_as + as - (1.384)mr -0.61158 0.02849 -0.5903 0.0285 8.42e-01 0.709 0.501 -1 0.0366
La_vbri - (0.209)vbsg + vbri -0.01438 0.03870 -0.0394 0.0364 8.08e-02 0.698 0.803 -1 0.1759
La_eas + eas - (1.374)mr -0.72443 0.05906 -0.7891 0.1222 4.93e-03 0.697 0.621 -1 0.3001

1.10 Comparing ILAA vs PCA vs EFA

1.10.1 PCA

# PCA comparison: project the continuous columns onto their principal
# components and inspect the correlation among the resulting features.
featuresnames <- colnames(dataframe)[colnames(dataframe) != outcome]
# tol=0.01 drops components with tiny standard deviation.
pc <- prcomp(dataframe[,iscontinous],center = TRUE,scale. = TRUE,tol=0.01)   #principal components
predPCA <- predict(pc,dataframe[,iscontinous])
# Recombine the PC scores with the non-continuous columns (incl. outcome).
PCAdataframe <- as.data.frame(cbind(predPCA,dataframe[,!iscontinous]))
colnames(PCAdataframe) <- c(colnames(predPCA),colnames(dataframe)[!iscontinous]) 
#plot(PCAdataframe[,colnames(PCAdataframe)!=outcome],col=dataframe[,outcome],cex=0.65,cex.lab=0.5,cex.axis=0.75,cex.sub=0.5,cex.main=0.75)

#pander::pander(pc$rotation)


# Correlation among the PCA-derived features (should be near-diagonal).
PCACor <- cor(PCAdataframe[,colnames(PCAdataframe) != outcome])


  gplots::heatmap.2(abs(PCACor),
                    trace = "none",
  #                  scale = "row",
                    mar = c(5,5),
                    col=rev(heat.colors(5)),
                    main = "PCA Correlation",
                    cexRow = 0.5,
                    cexCol = 0.5,
                     srtCol=45,
                     srtRow= -45,
                    key.title=NA,
                    key.xlab="Pearson Correlation",
                    xlab="Feature", ylab="Feature")

1.10.2 EFA


# EFA comparison: fit a varimax-rotated factor model on the continuous
# columns and inspect the correlation among the factor scores.

EFAdataframe <- dataframeScaled

# BUG FIX: the original tested length(iscontinous) -- the total number of
# columns -- where the number of *continuous* columns (sum) was intended.
if (sum(iscontinous) < 2000)
{
  # Number of factors: bounded by continuous features, samples, and the
  # retained PCA components (minus one); at least 2.
  topred <- min(sum(iscontinous),nrow(dataframeScaled),ncol(predPCA)-1)
  if (topred < 2) topred <- 2
  
  uls <- fa(dataframeScaled[,iscontinous],nfactors=topred,rotate="varimax",warnings=FALSE)  # EFA analysis
  predEFA <- predict(uls,dataframeScaled[,iscontinous])
  # Recombine factor scores with the non-continuous columns (incl. outcome).
  EFAdataframe <- as.data.frame(cbind(predEFA,dataframeScaled[,!iscontinous]))
  colnames(EFAdataframe) <- c(colnames(predEFA),colnames(dataframeScaled)[!iscontinous]) 


  
  # Correlation among the EFA-derived features.
  EFACor <- cor(EFAdataframe[,colnames(EFAdataframe) != outcome])
  
  
    gplots::heatmap.2(abs(EFACor),
                      trace = "none",
    #                  scale = "row",
                      mar = c(5,5),
                      col=rev(heat.colors(5)),
                      main = "EFA Correlation",
                      cexRow = 0.5,
                      cexCol = 0.5,
                       srtCol=45,
                       srtRow= -45,
                      key.title=NA,
                      key.xlab="Pearson Correlation",
                      xlab="Feature", ylab="Feature")
}

1.11 Effect on CAR modeling

# CART (rpart, depth <= 3) on the raw features; apparent (resubstitution)
# performance via epiR diagnostic-test statistics.
par(op)
par(xpd = TRUE)
dataframe[,outcome] <- factor(dataframe[,outcome])
rawmodel <- rpart(paste(outcome,"~."),dataframe,control=rpart.control(maxdepth=3))
pr <- predict(rawmodel,dataframe,type = "class")

  # Placeholder in case the tree degenerates to a single predicted class.
  ptab <- list(er="Error",detail=matrix(nrow=6,ncol=1))
  if (length(unique(pr))>1)
  {
    plot(rawmodel,main="Raw",branch=0.5,uniform = TRUE,compress = TRUE,margin=0.1)
    text(rawmodel, use.n = TRUE,cex=0.75)
    ptab <- epiR::epi.tests(table(pr==0,dataframe[,outcome]==0))
  }


# Confusion matrix (rows = truth, columns = prediction).
pander::pander(table(dataframe[,outcome],pr))
  0 1
0 83 15
1 11 87
# Accuracy (diag.ac), sensitivity, specificity and diagnostic odds ratio.
pander::pander(ptab$detail[c(5,3,4,6),])
  statistic est lower upper
5 diag.ac 0.867 0.812 0.911
3 se 0.888 0.808 0.943
4 sp 0.847 0.760 0.912
6 diag.or 43.764 19.005 100.778

# CART on the decorrelated (ILAA) features, restricted to the latent
# variables with residual variance >= 1% (varlistcV).
par(op)
par(xpd = TRUE)
DEdataframe[,outcome] <- factor(DEdataframe[,outcome])
IDeAmodel <- rpart(paste(outcome,"~."),DEdataframe[,c(outcome,varlistcV)],control=rpart.control(maxdepth=3))
pr <- predict(IDeAmodel,DEdataframe,type = "class")

  # Placeholder in case the tree degenerates to a single predicted class.
  ptab <- list(er="Error",detail=matrix(nrow=6,ncol=1))
  if (length(unique(pr))>1)
  {
    plot(IDeAmodel,main="ILAA",branch=0.5,uniform = TRUE,compress = TRUE,margin=0.1)
    text(IDeAmodel, use.n = TRUE,cex=0.75)
    ptab <- epiR::epi.tests(table(pr==0,DEdataframe[,outcome]==0))
  }

# Confusion matrix (rows = truth, columns = prediction).
pander::pander(table(DEdataframe[,outcome],pr))
  0 1
0 89 9
1 10 88
# Accuracy (diag.ac), sensitivity, specificity and diagnostic odds ratio.
pander::pander(ptab$detail[c(5,3,4,6),])
  statistic est lower upper
5 diag.ac 0.903 0.853 0.941
3 se 0.898 0.820 0.950
4 sp 0.908 0.833 0.957
6 diag.or 87.022 33.739 224.456

# CART on the PCA-transformed features.
par(op)
par(xpd = TRUE)
PCAdataframe[,outcome] <- factor(PCAdataframe[,outcome])
PCAmodel <- rpart(paste(outcome,"~."),PCAdataframe,control=rpart.control(maxdepth=3))
pr <- predict(PCAmodel,PCAdataframe,type = "class")
# Placeholder in case the tree degenerates to a single predicted class.
ptab <- list(er="Error",detail=matrix(nrow=6,ncol=1))
if (length(unique(pr))>1)
{
  plot(PCAmodel,main="PCA",branch=0.5,uniform = TRUE,compress = TRUE,margin=0.1)
  text(PCAmodel, use.n = TRUE,cex=0.75)
  ptab <- epiR::epi.tests(table(pr==0,PCAdataframe[,outcome]==0))
}

# Confusion matrix (rows = truth, columns = prediction).
pander::pander(table(PCAdataframe[,outcome],pr))
  0 1
0 85 13
1 20 78
# Accuracy (diag.ac), sensitivity, specificity and diagnostic odds ratio.
pander::pander(ptab$detail[c(5,3,4,6),])
  statistic est lower upper
5 diag.ac 0.832 0.772 0.881
3 se 0.796 0.703 0.871
4 sp 0.867 0.784 0.927
6 diag.or 25.500 11.891 54.684


par(op)

1.11.1 EFA


  # CART on the EFA factor scores.
  EFAdataframe[,outcome] <- factor(EFAdataframe[,outcome])
  EFAmodel <- rpart(paste(outcome,"~."),EFAdataframe,control=rpart.control(maxdepth=3))
  pr <- predict(EFAmodel,EFAdataframe,type = "class")
  
  # Placeholder in case the tree degenerates to a single predicted class.
  ptab <- list(er="Error",detail=matrix(nrow=6,ncol=1))
  if (length(unique(pr))>1)
  {
    plot(EFAmodel,main="EFA",branch=0.5,uniform = TRUE,compress = TRUE,margin=0.1)
    text(EFAmodel, use.n = TRUE,cex=0.75)
    ptab <- epiR::epi.tests(table(pr==0,EFAdataframe[,outcome]==0))
  }


  # Confusion matrix (rows = truth, columns = prediction).
  pander::pander(table(EFAdataframe[,outcome],pr))
  0 1
0 88 10
1 16 82
  # Accuracy (diag.ac), sensitivity, specificity and diagnostic odds ratio.
  pander::pander(ptab$detail[c(5,3,4,6),])
  statistic est lower upper
5 diag.ac 0.867 0.812 0.911
3 se 0.837 0.748 0.904
4 sp 0.898 0.820 0.950
6 diag.or 45.100 19.365 105.036
  par(op)